Exploring Dataset

# import pandas
import pandas as pd 
# load the Framingham CSV (relative path) into a DataFrame
csv_path = "../data/framingham.csv"
df = pd.read_csv(csv_path)

Tail

# tail(): called with no argument, it returns the last 5 rows
df.tail()
male age education currentSmoker cigsPerDay BPMeds prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose TenYearCHD
4235 0 48 2.0 1 20.0 NaN 0 0 0 248.0 131.0 72.0 22.00 84.0 86.0 0
4236 0 44 1.0 1 15.0 0.0 0 0 0 210.0 126.5 87.0 19.16 86.0 NaN 0
4237 0 52 2.0 0 0.0 0.0 0 0 0 269.0 133.5 83.0 21.47 80.0 107.0 0
4238 1 40 3.0 0 0.0 0.0 0 1 0 185.0 141.0 98.0 25.60 67.0 72.0 0
4239 0 39 3.0 1 30.0 0.0 0 0 0 196.0 133.0 86.0 20.91 85.0 80.0 0
# tail(n): returns the last n rows -- here n=8
df.tail(8)
male age education currentSmoker cigsPerDay BPMeds prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose TenYearCHD
4232 1 68 1.0 0 0.0 0.0 0 1 0 176.0 168.0 97.0 23.14 60.0 79.0 1
4233 1 50 1.0 1 1.0 0.0 0 1 0 313.0 179.0 92.0 25.97 66.0 86.0 1
4234 1 51 3.0 1 43.0 0.0 0 0 0 207.0 126.5 80.0 19.71 65.0 68.0 0
4235 0 48 2.0 1 20.0 NaN 0 0 0 248.0 131.0 72.0 22.00 84.0 86.0 0
4236 0 44 1.0 1 15.0 0.0 0 0 0 210.0 126.5 87.0 19.16 86.0 NaN 0
4237 0 52 2.0 0 0.0 0.0 0 0 0 269.0 133.5 83.0 21.47 80.0 107.0 0
4238 1 40 3.0 0 0.0 0.0 0 1 0 185.0 141.0 98.0 25.60 67.0 72.0 0
4239 0 39 3.0 1 30.0 0.0 0 0 0 196.0 133.0 86.0 20.91 85.0 80.0 0

Column Names

# columns: the Index holding the DataFrame's column labels
df.columns
Index(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

Observations and Variables (Rows and Columns)

# shape: a tuple of (number of rows, number of columns)
df.shape
(4240, 16)

Data Types

# dtypes: the data type of each column
df.dtypes
male                 int64
age                  int64
education          float64
currentSmoker        int64
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD           int64
dtype: object

Basic Information

# info(): prints an overview of the DataFrame -- index range, non-null count
# and dtype of every column, and memory usage
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB

Numerical Summary of a Dataset

# describe(): summary statistics for each numeric column -- count, mean, std,
# plus the five-number summary (min, 25%, 50%, 75%, max)
df.describe()
male age education currentSmoker cigsPerDay BPMeds prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose TenYearCHD
count 4240.000000 4240.000000 4135.000000 4240.000000 4211.000000 4187.000000 4240.000000 4240.000000 4240.000000 4190.000000 4240.000000 4240.000000 4221.000000 4239.000000 3852.000000 4240.000000
mean 0.429245 49.580189 1.979444 0.494104 9.005937 0.029615 0.005896 0.310613 0.025708 236.699523 132.354599 82.897759 25.800801 75.878981 81.963655 0.151887
std 0.495027 8.572942 1.019791 0.500024 11.922462 0.169544 0.076569 0.462799 0.158280 44.591284 22.033300 11.910394 4.079840 12.025348 23.954335 0.358953
min 0.000000 32.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 107.000000 83.500000 48.000000 15.540000 44.000000 40.000000 0.000000
25% 0.000000 42.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 206.000000 117.000000 75.000000 23.070000 68.000000 71.000000 0.000000
50% 0.000000 49.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 234.000000 128.000000 82.000000 25.400000 75.000000 78.000000 0.000000
75% 1.000000 56.000000 3.000000 1.000000 20.000000 0.000000 0.000000 1.000000 0.000000 263.000000 144.000000 90.000000 28.040000 83.000000 87.000000 0.000000
max 1.000000 70.000000 4.000000 1.000000 70.000000 1.000000 1.000000 1.000000 1.000000 696.000000 295.000000 142.500000 56.800000 143.000000 394.000000 1.000000
# flip the summary so that each variable is a row and each statistic a column
df.describe().transpose()
count mean std min 25% 50% 75% max
male 4240.0 0.429245 0.495027 0.00 0.00 0.0 1.00 1.0
age 4240.0 49.580189 8.572942 32.00 42.00 49.0 56.00 70.0
education 4135.0 1.979444 1.019791 1.00 1.00 2.0 3.00 4.0
currentSmoker 4240.0 0.494104 0.500024 0.00 0.00 0.0 1.00 1.0
cigsPerDay 4211.0 9.005937 11.922462 0.00 0.00 0.0 20.00 70.0
BPMeds 4187.0 0.029615 0.169544 0.00 0.00 0.0 0.00 1.0
prevalentStroke 4240.0 0.005896 0.076569 0.00 0.00 0.0 0.00 1.0
prevalentHyp 4240.0 0.310613 0.462799 0.00 0.00 0.0 1.00 1.0
diabetes 4240.0 0.025708 0.158280 0.00 0.00 0.0 0.00 1.0
totChol 4190.0 236.699523 44.591284 107.00 206.00 234.0 263.00 696.0
sysBP 4240.0 132.354599 22.033300 83.50 117.00 128.0 144.00 295.0
diaBP 4240.0 82.897759 11.910394 48.00 75.00 82.0 90.00 142.5
BMI 4221.0 25.800801 4.079840 15.54 23.07 25.4 28.04 56.8
heartRate 4239.0 75.878981 12.025348 44.00 68.00 75.0 83.00 143.0
glucose 3852.0 81.963655 23.954335 40.00 71.00 78.0 87.00 394.0
TenYearCHD 4240.0 0.151887 0.358953 0.00 0.00 0.0 0.00 1.0
# summary statistics for a single column (age)
df.age.describe()
count    4240.000000
mean       49.580189
std         8.572942
min        32.000000
25%        42.000000
50%        49.000000
75%        56.000000
max        70.000000
Name: age, dtype: float64
# summary statistics restricted to a chosen list of columns
selected_columns = ['age', 'BMI']
df[selected_columns].describe()
age BMI
count 4240.000000 4221.000000
mean 49.580189 25.800801
std 8.572942 4.079840
min 32.000000 15.540000
25% 42.000000 23.070000
50% 49.000000 25.400000
75% 56.000000 28.040000
max 70.000000 56.800000

Exploring Series

# read another dataset: the Titanic training CSV behind a bit.ly short link.
# Fetched over HTTPS rather than plain HTTP so the download cannot be
# tampered with in transit; network access is required for this cell.
titanic = pd.read_csv('https://bit.ly/kaggletrain')
# examine the first few rows
titanic.head()

Value Counts

# frequency of each distinct value in the Sex column
titanic.Sex.value_counts()
# the same tally expressed as proportions of the total (normalize=True)
titanic.Sex.value_counts(normalize=True)
# the result of value_counts() is itself a pandas Series
type(titanic.Sex.value_counts(normalize=True))

Unique()

# the distinct values that occur in the Fare column
titanic.Fare.unique()
# inspect the container type of the result (expected: numpy.ndarray)
type(titanic.Fare.unique())

Cross Tabulation

# contingency table: Sex values as rows, Survived values as columns
pd.crosstab(index=titanic.Sex, columns=titanic.Survived)

Describe

# describe a single numerical column (Age is treated as numeric throughout
# this notebook: mean, median, histogram)
# NOTE(review): the original comment said "categorical" -- if a categorical
# example was intended, a column such as 'Sex' would be the one to describe
titanic['Age'].describe()

Basic Statistics

# arithmetic mean of the Age column
titanic['Age'].mean()
# largest Age value
titanic['Age'].max()
# smallest Age value
titanic['Age'].min()
# middle (50th percentile) Age value
titanic['Age'].median()

Visualization

%matplotlib inline 
# bar chart of how many rows fall under each Sex value
titanic.Sex.value_counts().plot.bar()
# histogram showing the distribution of Age
titanic.Age.plot.hist()